# Global chunk options: suppress messages/warnings, display code exactly as
# typed, use a slightly smaller code font, and set a square, centered
# default figure.
knitr::opts_chunk$set(
  message    = FALSE,
  warning    = FALSE,
  tidy       = FALSE,
  size       = "small",
  fig.width  = 6.75,
  fig.height = 6.75,
  fig.align  = "center"
)

# print numbers with 3 significant digits
options(digits = 3)

1 Where Do People Drink The Most Beer, Wine And Spirits?

Using the ‘drinks’ data from the ‘fivethirtyeight’ package, we are analyzing the consumption of beer, wine and spirits in different countries.

# loading the package and dataset
library(fivethirtyeight)  # package that ships the `drinks` data frame
data(drinks)              # attach `drinks` (193 rows x 5 columns, per output below)

Viewing variable types

# reviewing all columns and data types in the dataset
glimpse(drinks)  # one character column (country), three <int>, one <dbl>
## Rows: 193
## Columns: 5
## $ country                      <chr> "Afghanistan", "Albania", "Algeria", "An…
## $ beer_servings                <int> 0, 89, 25, 245, 217, 102, 193, 21, 261, …
## $ spirit_servings              <int> 0, 132, 0, 138, 57, 128, 25, 179, 72, 75…
## $ wine_servings                <int> 0, 54, 14, 312, 45, 45, 221, 11, 212, 19…
## $ total_litres_of_pure_alcohol <dbl> 0.0, 4.9, 0.7, 12.4, 5.9, 4.9, 8.3, 3.8,…
# using head and skim function to explore the data further
head(drinks)  # first six rows
## # A tibble: 6 x 5
##   country      beer_servings spirit_servings wine_servings total_litres_of_pure…
##   <chr>                <int>           <int>         <int>                 <dbl>
## 1 Afghanistan              0               0             0                   0  
## 2 Albania                 89             132            54                   4.9
## 3 Algeria                 25               0            14                   0.7
## 4 Andorra                245             138           312                  12.4
## 5 Angola                 217              57            45                   5.9
## 6 Antigua & B…           102             128            45                   4.9
# per-variable summary: missingness, quantiles and inline histograms
skim(drinks)
Data summary
Name drinks
Number of rows 193
Number of columns 5
_______________________
Column type frequency:
character 1
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 0 1 3 28 0 193 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
beer_servings 0 1 106.16 101.14 0 20.0 76.0 188.0 376.0 ▇▃▂▂▁
spirit_servings 0 1 80.99 88.28 0 4.0 56.0 128.0 438.0 ▇▃▂▁▁
wine_servings 0 1 49.45 79.70 0 1.0 8.0 59.0 370.0 ▇▁▁▁▁
total_litres_of_pure_alcohol 0 1 4.72 3.77 0 1.3 4.2 7.2 14.4 ▇▃▅▃▁

As shown above, there are 5 variables with 3 variable types. ‘country’ is the only character variable and ‘total_litres_of_pure_alcohol’ is the only double variable (double-precision floating point). The 3 integer variables are ‘beer_servings’, ‘spirit_servings’ and ‘wine_servings’. There seems to be no missing data. However, there are 0 values assigned to 13 countries.

Plotting the top 25 beer consuming countries

# keep the 25 countries with the highest beer servings
top_beer_countries <- drinks %>%
  top_n(25, beer_servings)

# horizontal bar chart, countries ordered by beer consumption (no legend,
# centered title)
ggplot(top_beer_countries,
       aes(x = beer_servings,
           y = reorder(country, beer_servings),
           fill = beer_servings)) +
  geom_col() +
  labs(title = "Top 25 Beer Drinking Countries",
       x = "Beer Consumption",
       y = "Country") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

Plotting the top 25 wine consuming countries

# keep the 25 countries with the highest wine servings
top_wine_countries <- drinks %>%
  top_n(25, wine_servings)

# horizontal bar chart, countries ordered by wine consumption (no legend,
# centered title)
ggplot(top_wine_countries,
       aes(x = wine_servings,
           y = reorder(country, wine_servings),
           fill = wine_servings)) +
  geom_col() +
  labs(title = "Top 25 Wine Drinking Countries",
       x = "Wine Consumption",
       y = "Country") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

Plotting the top 25 spirit consuming countries

# keep the 25 countries with the highest spirit servings
top_spirit_countries <- drinks %>%
  top_n(25, spirit_servings)

# horizontal bar chart, countries ordered by spirit consumption (no legend,
# centered title)
ggplot(top_spirit_countries,
       aes(x = spirit_servings,
           y = reorder(country, spirit_servings),
           fill = spirit_servings)) +
  geom_col() +
  labs(title = "Top 25 Spirit Drinking Countries",
       x = "Spirit Consumption",
       y = "Country") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

From viewing the graphs above, it is clear that geography affects the level of consumption of wine, beer and spirits. In the top 25 beer and wine consuming countries, European countries are predominantly present. In contrast, spirits are more diverse, with Asian countries such as Thailand, Japan and China making an appearance, and even Oceania represented by the Cook Islands.

This could be due to both culture and climate of each country. Consumption seems to follow production. France, famously known for its wide variety of wine production, unsurprisingly is the top consumer of wine. Other countries that host favorable climate and cultures are also present near the top of the list- from Portugal to Italy. Furthermore, this trend is repeated in the consumption of beer with Germany and Ireland near the top of the list. Oktoberfest and Guinness, two respective cultural hallmarks of each country understandably explains this trend.

In contrast, spirits are more diverse, reflecting the variety of methods used to create them, from Russian potatoes to sugar cane in the Caribbean. The range of servings across the top 25 beer drinking countries is far narrower than for wine and spirits — Grenada in particular stands far apart from the other spirit-consuming countries. This could indicate that people's taste in beer is less polarized than their taste in wine and spirits.

2 Analysis of movies- IMDB dataset

A sample of movies will be analyzed from the IMDB 5000 movie dataset.

# read the movies dataset from data/movies.csv
movies <- read_csv(here::here("data", "movies.csv"))

# viewing the dataset
glimpse(movies)  # column names and types
head(movies)     # first six rows
skim(movies)     # per-column summary statistics

As per the above, there appear to be no missing values, however there were 54 duplicate observations.

movies <- read_csv(here::here("data", "movies.csv"))
# removing duplicate titles via distinct().
# Fix: the argument is `.keep_all` (with a leading dot). The original
# `keep_all = FALSE` was silently treated as a NEW column to compute, so the
# result contained a spurious logical `keep_all` variable instead of the
# movie columns. `.keep_all = TRUE` keeps all columns of the first
# occurrence of each title.
skim(distinct(movies, title, .keep_all = TRUE))
Data summary
Name distinct(movies, title, k…
Number of rows 2907
Number of columns 2
_______________________
Column type frequency:
character 1
logical 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 83 0 2907 0

Variable type: logical

skim_variable n_missing complete_rate mean count
keep_all 0 1 0 FAL: 2907

Next, we create a table of the number of movies by genre.

# tally the number of movies per genre, most common genre first
# (wt = NULL means plain row counts)
movies_table <- count(movies, genre, wt = NULL, sort = TRUE)

movies_table
## # A tibble: 17 x 2
##    genre           n
##    <chr>       <int>
##  1 Comedy        848
##  2 Action        738
##  3 Drama         498
##  4 Adventure     288
##  5 Crime         202
##  6 Biography     135
##  7 Horror        131
##  8 Animation      35
##  9 Fantasy        28
## 10 Documentary    25
## 11 Mystery        16
## 12 Sci-Fi          7
## 13 Family          3
## 14 Musical         2
## 15 Romance         2
## 16 Western         2
## 17 Thriller        1

Return on budget is calculated as the ratio between how much money a film made compared to the budget used during production

# average return on budget per genre: the ratio of mean gross earnings to
# mean budget, sorted from most to least profitable. NAs are dropped from
# both means. (summarise() may reference columns it just created, so the
# ratio is computed inline rather than in a separate mutate().)
gross_budget <- movies %>%
  group_by(genre) %>%
  summarise(
    average_gross    = mean(gross, na.rm = TRUE),
    average_budget   = mean(budget, na.rm = TRUE),
    return_on_budget = average_gross / average_budget
  ) %>%
  arrange(desc(return_on_budget))

gross_budget
## # A tibble: 17 x 4
##    genre       average_gross average_budget return_on_budget
##    <chr>               <dbl>          <dbl>            <dbl>
##  1 Musical         92084000        3189500          28.9    
##  2 Family         149160478.      14833333.         10.1    
##  3 Western         20821884        3465000           6.01   
##  4 Documentary     17353973.       5887852.          2.95   
##  5 Horror          37713738.      13504916.          2.79   
##  6 Fantasy         42408841.      17582143.          2.41   
##  7 Comedy          42630552.      24446319.          1.74   
##  8 Mystery         67533021.      39218750           1.72   
##  9 Animation       98433792.      61701429.          1.60   
## 10 Biography       45201805.      28543696.          1.58   
## 11 Adventure       95794257.      66290069.          1.45   
## 12 Drama           37465371.      26242933.          1.43   
## 13 Crime           37502397.      26596169.          1.41   
## 14 Romance         31264848.      25107500           1.25   
## 15 Action          86583860.      71354888.          1.21   
## 16 Sci-Fi          29788371.      27607143.          1.08   
## 17 Thriller            2468         300000           0.00823

Next, we analyse the gross earnings of the top 15 directors (ranked by total earnings of the movies they produced).

# total, mean, median and standard deviation of gross earnings per director,
# keeping only the 15 directors with the highest total, ranked descending
gross_directors <- movies %>%
  group_by(director) %>%
  summarise(
    total_gross   = sum(gross, na.rm = TRUE),
    mean_gross    = mean(gross, na.rm = TRUE),
    median_gross  = median(gross, na.rm = TRUE),
    std_dev_gross = sd(gross, na.rm = TRUE)
  ) %>%
  top_n(15, total_gross) %>%
  arrange(desc(total_gross))

gross_directors
## # A tibble: 15 x 5
##    director          total_gross mean_gross median_gross std_dev_gross
##    <chr>                   <dbl>      <dbl>        <dbl>         <dbl>
##  1 Steven Spielberg   4014061704 174524422.   164435221     101421051.
##  2 Michael Bay        2231242537 171634041.   138396624     127161579.
##  3 Tim Burton         2071275480 129454718.    76519172     108726924.
##  4 Sam Raimi          2014600898 201460090.   234903076     162126632.
##  5 James Cameron      1909725910 318287652.   175562880.    309171337.
##  6 Christopher Nolan  1813227576 226653447    196667606.    187224133.
##  7 George Lucas       1741418480 348283696    380262555     146193880.
##  8 Robert Zemeckis    1619309108 124562239.   100853835      91300279.
##  9 Clint Eastwood     1378321100  72543216.    46700000      75487408.
## 10 Francis Lawrence   1358501971 271700394.   281666058     135437020.
## 11 Ron Howard         1335988092 111332341    101587923      81933761.
## 12 Gore Verbinski     1329600995 189942999.   123207194     154473822.
## 13 Andrew Adamson     1137446920 284361730    279680930.    120895765.
## 14 Shawn Levy         1129750988 102704635.    85463309      65484773.
## 15 Ridley Scott       1128857598  80632686.    47775715      68812285.

We also review the ratings of all the movies by genre and create table with other details.

# minimum, maximum, mean, median and standard deviation of ratings by genre.
# Fix: na.rm = TRUE is now used consistently. The original called
# max(rating) with the default na.rm = FALSE and min(rating, na.rm = FALSE)
# explicitly, so any genre containing a missing rating would report NA for
# max/min while the other summaries silently dropped the NA.
ratings_table <- movies %>%
  group_by(genre) %>%
  summarise(
    max_rating     = max(rating, na.rm = TRUE),
    min_rating     = min(rating, na.rm = TRUE),
    mean_rating    = mean(rating, na.rm = TRUE),
    median_rating  = median(rating, na.rm = TRUE),
    std_dev_rating = sd(rating, na.rm = TRUE)
  ) %>%
  arrange(genre)

# displaying results in a table
ratings_table
## # A tibble: 17 x 6
##    genre       max_rating min_rating mean_rating median_rating std_dev_rating
##    <chr>            <dbl>      <dbl>       <dbl>         <dbl>          <dbl>
##  1 Action             9          2.1        6.23          6.3           1.03 
##  2 Adventure          8.6        2.3        6.51          6.6           1.09 
##  3 Animation          8          4.5        6.65          6.9           0.968
##  4 Biography          8.9        4.5        7.11          7.2           0.760
##  5 Comedy             8.8        1.9        6.11          6.2           1.02 
##  6 Crime              9.3        4.8        6.92          6.9           0.849
##  7 Documentary        8.5        1.6        6.66          7.4           1.77 
##  8 Drama              8.8        2.1        6.73          6.8           0.917
##  9 Family             7.9        5.7        6.5           5.9           1.22 
## 10 Fantasy            7.9        4.3        6.15          6.45          0.959
## 11 Horror             8.5        3.6        5.83          5.9           1.01 
## 12 Musical            7.2        6.3        6.75          6.75          0.636
## 13 Mystery            8.5        4.6        6.86          6.9           0.882
## 14 Romance            7.1        6.2        6.65          6.65          0.636
## 15 Sci-Fi             8.2        5          6.66          6.4           1.09 
## 16 Thriller           4.8        4.8        4.8           4.8          NA    
## 17 Western            7.3        4.1        5.70          5.70          2.26
# histogram showing the distribution of ratings, one facet per genre.
# Fix: the original both piped `movies` into ggplot() AND passed
# `data = movies`; with `data` matched by name, the piped data frame lands
# in the `mapping` argument and ggplot() errors. Pipe only.
ratings_plot <- movies %>%
  ggplot(mapping = aes(x = rating, fill = genre)) +
  geom_histogram(binwidth = 0.7) +
  facet_wrap(~genre) +
  labs(title = "Distribution of Ratings by Genre", x = "Rating", y = "Frequency") +
  theme(legend.position = "none")

ratings_plot

To determine whether there is a relationship between number of facebook likes the cast of a movie receives and the gross earnings of that movie, we plot the data as below -

# scatter plot of gross earnings against cast Facebook likes.
# alpha = 0.2 makes clusters of overlapping points visible; because a few
# extreme outliers would compress the plot, the x axis is capped at 200,000.
ggplot(movies, aes(x = cast_facebook_likes, y = gross)) +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  scale_x_continuous(limits = c(0, 200000)) +
  scale_y_continuous() +
  labs(title = "Relationship Between Gross Earnings and Cast Facebook Likes ",
       x = "Cast Facebook Likes",
       y = "Gross") +
  theme(plot.title = element_text(hjust = 0.5))

Examining the plot, we can conclude that that Cast Facebook Likes is not a good predictor of gross movie revenue. There is no clear trend with a varying amount of movie revenue earned per amount of Facebook Cast Likes.

Let’s see if another variable portrays better correlation with gross revenue.

# scatter plot of gross earnings against budget, with a smoothed trend line
# and a centered title; alpha = 0.2 reveals overplotted regions
ggplot(movies, aes(x = budget, y = gross)) +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  labs(title = "Relationship Between Gross Earnings and Budget ",
       x = "Budget",
       y = "Gross") +
  theme(plot.title = element_text(hjust = 0.5))

As seen in the plot above, budget is a better indicator to determine gross revenue for each movie, however it is far from being a perfect indicator. For each budget value, there is a wide spread of values for gross revenue. However, it is clear that with a low budget, a movie is likely to have low gross revenue.

Let us also check if ratings are a good indicator of gross revenue a movie earns.

# ratings vs gross earnings, faceted by genre.
# Fix: the original both piped `movies` into ggplot() AND passed
# `data = movies`, which pushes the piped data frame into `mapping` and
# makes ggplot() error. Pipe only.
movies %>%
  ggplot(mapping = aes(x = rating, y = gross)) +
  geom_point(alpha = 0.2) +
  labs(title = "Relationship Between Gross Earnings and Ratings ", x = "Rating", y = "Gross") +
  geom_smooth() +
  facet_wrap(~genre) +
  scale_y_continuous(labels = scales::comma)

As seen above there are varying relationships developed for IMDB rating and gross revenue for each genre.

Genres that have the largest data such as action, comedy and drama show that as ratings increase, there is more gross revenue. Action in particular highlights this relationship. However there are a few genres with limited data points where no relationship can be established such as Musical, Romance, Thriller and Western.

We think this such correlation and lack of data points also has something to do with the popularity of each genre. In less popular genres such as Documentary, irrespective of the ratings, the gross earnings remain more or less constant. In more popular genres such as Action and Adventure, higher ratings are relatively more correlated with earnings.

3 Returns of Financial Stocks

# reading the csv file and assigning it to the variable nyse
nyse <- read_csv(here::here("data","nyse.csv"))
nyse  # 508 NYSE companies: symbol, name, ipo_year, sector, industry, quote URL
## # A tibble: 508 x 6
##    symbol name          ipo_year sector    industry           summary_quote     
##    <chr>  <chr>         <chr>    <chr>     <chr>              <chr>             
##  1 MMM    3M Company    n/a      Health C… Medical/Dental In… https://www.nasda…
##  2 ABB    ABB Ltd       n/a      Consumer… Electrical Produc… https://www.nasda…
##  3 ABT    Abbott Labor… n/a      Health C… Major Pharmaceuti… https://www.nasda…
##  4 ABBV   AbbVie Inc.   2012     Health C… Major Pharmaceuti… https://www.nasda…
##  5 ACN    Accenture plc 2001     Miscella… Business Services  https://www.nasda…
##  6 AAP    Advance Auto… n/a      Consumer… Other Specialty S… https://www.nasda…
##  7 AFL    Aflac Incorp… n/a      Finance   Accident &Health … https://www.nasda…
##  8 A      Agilent Tech… 1999     Capital … Biotechnology: La… https://www.nasda…
##  9 AEM    Agnico Eagle… n/a      Basic In… Precious Metals    https://www.nasda…
## 10 APD    Air Products… n/a      Basic In… Major Chemicals    https://www.nasda…
## # … with 498 more rows

To get an idea of the distribution of companies by sector, let’s assemble them in the form of a bar plot.

# number of companies in each sector, largest first
# (wt = NULL means plain row counts)
companies_by_sector <- nyse %>%
  count(sector, wt = NULL, sort = TRUE) %>%
  rename(no_of_companies = n)

# horizontal bar chart of the sector counts, ordered by size
ggplot(companies_by_sector,
       aes(x = no_of_companies,
           y = reorder(sector, no_of_companies),
           fill = no_of_companies)) +
  geom_col() +
  labs(title = "Companies by Sector",
       x = "Number of Companies",
       y = "Sector") +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

companies_by_sector
## # A tibble: 12 x 2
##    sector                no_of_companies
##    <chr>                           <int>
##  1 Finance                            97
##  2 Consumer Services                  79
##  3 Public Utilities                   60
##  4 Capital Goods                      45
##  5 Health Care                        45
##  6 Energy                             42
##  7 Technology                         40
##  8 Basic Industries                   39
##  9 Consumer Non-Durables              31
## 10 Miscellaneous                      12
## 11 Transportation                     10
## 12 Consumer Durables                   8

Next, we choose a portfolio of 6 companies as per our preference and SPY.

# extracting data specific to chosen companies: six stocks plus the SPY ETF,
# daily prices from 2011-01-01 to 2020-08-31 (downloads data; needs network)
myStocks <- c("BABA", "T", "BA", "C", "DEO", "TWTR","SPY" ) %>%
  tq_get(get  = "stock.prices",
         from = "2011-01-01",
         to   = "2020-08-31") %>%
  group_by(symbol)   # group by ticker so later return calculations are per stock

glimpse(myStocks)
## Rows: 15,366
## Columns: 8
## Groups: symbol [7]
## $ symbol   <chr> "BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "BAB…
## $ date     <date> 2014-09-19, 2014-09-22, 2014-09-23, 2014-09-24, 2014-09-25,…
## $ open     <dbl> 92.7, 92.7, 88.9, 88.5, 91.1, 89.7, 89.6, 89.0, 88.7, 86.3, …
## $ high     <dbl> 99.7, 92.9, 90.5, 90.6, 91.5, 90.5, 89.7, 90.9, 88.9, 88.2, …
## $ low      <dbl> 89.9, 89.5, 86.6, 87.2, 88.5, 88.7, 88.0, 88.5, 86.0, 85.6, …
## $ close    <dbl> 93.9, 89.9, 87.2, 90.6, 88.9, 90.5, 88.8, 88.8, 86.1, 87.1, …
## $ volume   <dbl> 2.72e+08, 6.67e+07, 3.90e+07, 3.21e+07, 2.86e+07, 1.83e+07, …
## $ adjusted <dbl> 93.9, 89.9, 87.2, 90.6, 88.9, 90.5, 88.8, 88.8, 86.1, 87.1, …

Let us calculate daily, monthly and yearly returns on the portfolio that we have chosen above.

# calculating daily log returns per stock.
# Fix: the original passed `cols = c(nested.col)` — a leftover from a
# deprecated tq_transmute argument that references an object (`nested.col`)
# not defined anywhere in this document — removed.
myStocks_returns_daily <- myStocks %>%
  tq_transmute(select     = adjusted,
               mutate_fun = periodReturn,
               period     = "daily",
               type       = "log",
               col_rename = "daily_returns")

myStocks_returns_daily
## # A tibble: 15,366 x 3
## # Groups:   symbol [7]
##    symbol date       daily_returns
##    <chr>  <date>             <dbl>
##  1 BABA   2014-09-19       0      
##  2 BABA   2014-09-22      -0.0435 
##  3 BABA   2014-09-23      -0.0307 
##  4 BABA   2014-09-24       0.0383 
##  5 BABA   2014-09-25      -0.0184 
##  6 BABA   2014-09-26       0.0172 
##  7 BABA   2014-09-29      -0.0191 
##  8 BABA   2014-09-30       0.00113
##  9 BABA   2014-10-01      -0.0314 
## 10 BABA   2014-10-02       0.0111 
## # … with 15,356 more rows
# calculating monthly arithmetic returns per stock.
# Fix: the deprecated `cols = c(nested.col)` argument (referencing an
# undefined object) has been removed.
myStocks_returns_monthly <- myStocks %>%
  tq_transmute(select     = adjusted,
               mutate_fun = periodReturn,
               period     = "monthly",
               type       = "arithmetic",
               col_rename = "monthly_returns")

myStocks_returns_monthly
## # A tibble: 734 x 3
## # Groups:   symbol [7]
##    symbol date       monthly_returns
##    <chr>  <date>               <dbl>
##  1 BABA   2014-09-30         -0.0537
##  2 BABA   2014-10-31          0.110 
##  3 BABA   2014-11-28          0.132 
##  4 BABA   2014-12-31         -0.0690
##  5 BABA   2015-01-30         -0.143 
##  6 BABA   2015-02-27         -0.0445
##  7 BABA   2015-03-31         -0.0221
##  8 BABA   2015-04-30         -0.0234
##  9 BABA   2015-05-29          0.0988
## 10 BABA   2015-06-30         -0.0789
## # … with 724 more rows
# calculating yearly arithmetic returns per stock.
# Fixes: (1) the deprecated `cols = c(nested.col)` argument (referencing an
# undefined object) is removed; (2) the extra group_by(symbol) is dropped —
# myStocks is already grouped by symbol (see its construction above).
myStocks_returns_annual <- myStocks %>%
  tq_transmute(select     = adjusted,
               mutate_fun = periodReturn,
               period     = "yearly",
               type       = "arithmetic",
               col_rename = "yearly_returns")

myStocks_returns_annual
## # A tibble: 65 x 3
## # Groups:   symbol [7]
##    symbol date       yearly_returns
##    <chr>  <date>              <dbl>
##  1 BABA   2014-12-31         0.107 
##  2 BABA   2015-12-31        -0.218 
##  3 BABA   2016-12-30         0.0805
##  4 BABA   2017-12-29         0.964 
##  5 BABA   2018-12-31        -0.205 
##  6 BABA   2019-12-31         0.547 
##  7 BABA   2020-08-28         0.363 
##  8 T      2011-12-30         0.0796
##  9 T      2012-12-31         0.175 
## 10 T      2013-12-31         0.0972
## # … with 55 more rows

Further, we analyse the monthly returns of our portfolio by calculating the minimum, maximum, mean, median and standard deviation.

# minimum, maximum, median, mean and standard deviation of monthly returns
# for each ticker in the portfolio
summary_monthly_returns <- myStocks_returns_monthly %>%
  group_by(symbol) %>%
  summarise(
    min_return    = min(monthly_returns),
    max_return    = max(monthly_returns),
    median_return = median(monthly_returns),
    mean_return   = mean(monthly_returns),
    sd_return     = sd(monthly_returns)
  )

summary_monthly_returns
## # A tibble: 7 x 6
##   symbol min_return max_return median_return mean_return sd_return
##   <chr>       <dbl>      <dbl>         <dbl>       <dbl>     <dbl>
## 1 BA         -0.458     0.257        0.0203      0.0145     0.0862
## 2 BABA       -0.196     0.422        0.0101      0.0209     0.105 
## 3 C          -0.336     0.238        0.00677     0.00568    0.0901
## 4 DEO        -0.104     0.0956       0.0134      0.00865    0.0435
## 5 SPY        -0.125     0.127        0.0146      0.0112     0.0381
## 6 T          -0.172     0.104        0.00406     0.00586    0.0476
## 7 TWTR       -0.274     0.531        0.0107      0.00984    0.150

Next, we construct a density plot for each stock chosen, including the ETF S&P 500 to assess how risky each one of them is.

# density of monthly returns, one panel per ticker — wider curves indicate
# riskier (more variable) stocks
ggplot(myStocks_returns_monthly, aes(x = monthly_returns)) +
  geom_density() +
  facet_wrap(~symbol) +
  labs(title = "Monthly Return Density Plot",
       x = "Monthly Return",
       y = "Frequency")

As seen from the above plot, Twitter (TWTR) is the riskiest as it has the greatest variance in monthly return. The least risky is the S&P 500 (SPY) as the monthly returns have less variance. This is as predicted due to the nature of an ETF being a consolidation of other stocks and therefore it will have less inherent risk.

We also create a Risk vs Return plot for expected monthly returns on each stock

# risk (standard deviation of monthly returns) on the x axis against
# expected monthly return on the y axis; geom_text_repel inherits x/y from
# the plot aesthetics, so it only needs the label mapping
ggplot(summary_monthly_returns, aes(x = sd_return, y = mean_return)) +
  geom_point() +
  ggrepel::geom_text_repel(aes(label = symbol)) +
  labs(title = "Risk vs Return Plot",
       x = "Risk",
       y = "Expected Monthly Return")

From the above plot it is clear that some companies have more return per given amount of risk. Overall, both CitiGroup (C) and Twitter (TWTR) have relatively high risk compared to the modest level of returns they offer. A risk-averse investor would prefer Boeing (BA) in comparison to CitiGroup (C) as it is slightly less risky and offers nearly 3 times more (expected) returns. Similarly, between Twitter (TWTR) and Diageo (DEO) both of which have approximately the same amount of expected returns, a risk-averse investor would prefer Diageo as it is relatively risk-free.

4 IBM HR Analytics

# loading and viewing the HR dataset (IBM HR Analytics employee attrition)
hr_dataset <- read_csv(here::here("data", "datasets_1067_1925_WA_Fn-UseC_-HR-Employee-Attrition.csv"))
glimpse(hr_dataset)  # 1,470 employees x 35 columns (see output below)
## Rows: 1,470
## Columns: 35
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, …
## $ Attrition                <chr> "Yes", "No", "Yes", "No", "No", "No", "No", …
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Trave…
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358…
## $ Department               <chr> "Sales", "Research & Development", "Research…
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26,…
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3,…
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "…
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16…
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3,…
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", …
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, …
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2,…
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1,…
## $ JobRole                  <chr> "Sales Executive", "Research Scientist", "La…
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3,…
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "M…
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 26…
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 996…
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5,…
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes"…
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, …
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3,…
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2,…
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, …
## $ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0,…
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, …
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4,…
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3,…
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4…
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2,…
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0,…
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3,…
# clean the dataset by assigning numerical data to more descriptive data.
# clean_names() converts column names to snake_case; each case_when() maps
# an integer survey code (1-5 or 1-4) to its descriptive label; select()
# keeps only the columns used in the analysis below.

hr_cleaned <- hr_dataset %>% 
  clean_names() %>% 
  mutate(
    # education level codes 1-5
    education = case_when(
      education == 1 ~ "Below College",
      education == 2 ~ "College",
      education == 3 ~ "Bachelor",
      education == 4 ~ "Master",
      education == 5 ~ "Doctor"
    ),
    # satisfaction codes 1-4
    environment_satisfaction = case_when(
      environment_satisfaction == 1 ~ "Low",
      environment_satisfaction == 2 ~ "Medium",
      environment_satisfaction == 3 ~ "High",
      environment_satisfaction == 4 ~ "Very High"
    ),
    job_satisfaction = case_when(
      job_satisfaction == 1 ~ "Low",
      job_satisfaction == 2 ~ "Medium",
      job_satisfaction == 3 ~ "High",
      job_satisfaction == 4 ~ "Very High"
    ),
    # performance rating codes 1-4
    performance_rating = case_when(
      performance_rating == 1 ~ "Low",
      performance_rating == 2 ~ "Good",
      performance_rating == 3 ~ "Excellent",
      performance_rating == 4 ~ "Outstanding"
    ),
    # work-life balance codes 1-4
    work_life_balance = case_when(
      work_life_balance == 1 ~ "Bad",
      work_life_balance == 2 ~ "Good",
      work_life_balance == 3 ~ "Better",
      work_life_balance == 4 ~ "Best"
    )
  ) %>% 
  # keep only the columns used in the analysis
  select(age, attrition, daily_rate, department,
         distance_from_home, education,
         gender, job_role,environment_satisfaction,
         job_satisfaction, marital_status,
         monthly_income, num_companies_worked, percent_salary_hike,
         performance_rating, total_working_years,
         work_life_balance, years_at_company,
         years_since_last_promotion)
hr_cleaned
## # A tibble: 1,470 x 19
##      age attrition daily_rate department distance_from_h… education gender
##    <dbl> <chr>          <dbl> <chr>                 <dbl> <chr>     <chr> 
##  1    41 Yes             1102 Sales                     1 College   Female
##  2    49 No               279 Research …                8 Below Co… Male  
##  3    37 Yes             1373 Research …                2 College   Male  
##  4    33 No              1392 Research …                3 Master    Female
##  5    27 No               591 Research …                2 Below Co… Male  
##  6    32 No              1005 Research …                2 College   Male  
##  7    59 No              1324 Research …                3 Bachelor  Female
##  8    30 No              1358 Research …               24 Below Co… Male  
##  9    38 No               216 Research …               23 Bachelor  Male  
## 10    36 No              1299 Research …               27 Bachelor  Male  
## # … with 1,460 more rows, and 12 more variables: job_role <chr>,
## #   environment_satisfaction <chr>, job_satisfaction <chr>,
## #   marital_status <chr>, monthly_income <dbl>, num_companies_worked <dbl>,
## #   percent_salary_hike <dbl>, performance_rating <chr>,
## #   total_working_years <dbl>, work_life_balance <chr>, years_at_company <dbl>,
## #   years_since_last_promotion <dbl>

Below are some calculations to better understand the hr_cleaned dataset.

# calculating attrition rate: count employees by attrition status
# ("Yes" = left the company, "No" = stayed)
attrition_rate <- hr_cleaned %>% 
  count(attrition) 

attrition_rate
## # A tibble: 2 x 2
##   attrition     n
##   <chr>     <int>
## 1 No         1233
## 2 Yes         237

Out of all the employees, 237 employees left and 1233 employees stayed. So the attrition rate is roughly 16.12%

Next, we try to understand the causal factors behind such an attrition rate by examining the different variables that could’ve influenced it.

# Five-number summaries (plus the mean) for the four key numeric variables:
# 'age', 'years_at_company', 'monthly_income' and 'years_since_last_promotion'.
summary(hr_cleaned$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    18.0    30.0    36.0    36.9    43.0    60.0
summary(hr_cleaned$years_at_company)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       3       5       7       9      40
summary(hr_cleaned$monthly_income)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1009    2911    4919    6503    8379   19999
summary(hr_cleaned$years_since_last_promotion)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00    2.19    3.00   15.00
# Histogram of the age distribution of all employees.
# Fixes from review: the data frame was both piped into ggplot() and passed
# again as `data =`, and `fill` was mapped to the continuous x variable (age),
# which is not a meaningful aesthetic for a histogram.
hr_plot_age <- ggplot(hr_cleaned, aes(x = age)) +
  geom_histogram(binwidth = 2) +
  labs(title = "Distribution of Age", x = "Age", y = "Frequency") + 
  theme(legend.position = "none") +
  NULL

hr_plot_age

# Histogram of employment tenure (years employees have worked at IBM).
# Fixes from review: removed the redundant `data =` argument alongside the
# pipe, and dropped the fill mapping to a continuous variable.
hr_plot_years <- ggplot(hr_cleaned, aes(x = years_at_company)) +
  geom_histogram(binwidth = 2) +
  labs(title = "Distribution of Years at Company", x = "Years at Company", y = "Frequency") + 
  theme(legend.position = "none") +
  NULL

hr_plot_years

# Histogram of monthly income for all employees.
# Fixes from review: removed the redundant `data =` argument alongside the
# pipe, and dropped the fill mapping to a continuous variable.
hr_plot_income <- ggplot(hr_cleaned, aes(x = monthly_income)) +
  geom_histogram(binwidth = 200) +
  labs(title = "Distribution of Monthly Income", x = "Monthly Income", y = "Frequency") + 
  theme(legend.position = "none") +
  NULL

hr_plot_income

# Histogram of years elapsed since each employee's last promotion.
# Fixes from review: removed the redundant `data =` argument alongside the
# pipe, and dropped the fill mapping to a continuous variable.
hr_plot_last_promotion <- ggplot(hr_cleaned, aes(x = years_since_last_promotion)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Distribution of Years Since Last Promotion", x = "Years Since Last Promotion", y = "Frequency") + 
  theme(legend.position = "none") +
  NULL

hr_plot_last_promotion

To make things easier, we also order the descriptive variables in ‘job_satisfaction’ in an appropriate manner and calculate percentage for each of them.

# Order the job satisfaction levels from lowest to highest so that tables
# and plots display them in a sensible sequence
hr_cleaned$job_satisfaction <- factor(
  hr_cleaned$job_satisfaction,
  levels = c("Low", "Medium", "High", "Very High")
)

# tally employees at each satisfaction level
job_satisfaction_percent <- count(hr_cleaned, job_satisfaction)

job_satisfaction_percent
## # A tibble: 4 x 2
##   job_satisfaction     n
##   <fct>            <int>
## 1 Low                289
## 2 Medium             280
## 3 High               442
## 4 Very High          459
# Convert counts to percentages; naming the column explicitly gives the
# printed tibble a readable header instead of `n/sum(n) * 100`
job_satisfaction_percent %>%
  mutate(percent = n / sum(n) * 100)
## # A tibble: 4 x 3
##   job_satisfaction     n `n/sum(n) * 100`
##   <fct>            <int>            <dbl>
## 1 Low                289             19.7
## 2 Medium             280             19.0
## 3 High               442             30.1
## 4 Very High          459             31.2
# Bar chart of job satisfaction levels with percentage labels above the bars.
# Fixes from review: geom_bar() has no `binwidth` argument (it was silently
# ignored), and `x = 1:2:3:4` / `y = 1000:1000:1000:1000` only produced usable
# positions by accident through repeated application of the `:` operator
# (with warnings); explicit positions are used instead.
hr_plot_satisfaction <- ggplot(hr_cleaned, aes(x = job_satisfaction)) +
  geom_bar() +
  labs(title = "Distribution of Job Satisfaction", x = "Job Satisfaction", y = "Frequency") + 
  theme(legend.position = "none") +
  annotate("text", x = 1:4, y = 1000,
           label = c("19.7%", "19.0%", "30.1%", "31.2%")) +
  NULL

hr_plot_satisfaction

In addition, we also order the descriptive variables in ‘work_life_balance’ appropriately

# Order the work life balance levels from worst to best so that tables
# and plots display them in a sensible sequence
hr_cleaned$work_life_balance <- factor(
  hr_cleaned$work_life_balance,
  levels = c("Bad", "Good", "Better", "Best")
)

# tally employees at each work life balance level
work_life_percent <- count(hr_cleaned, work_life_balance)

work_life_percent
## # A tibble: 4 x 2
##   work_life_balance     n
##   <fct>             <int>
## 1 Bad                  80
## 2 Good                344
## 3 Better              893
## 4 Best                153
# Convert counts to percentages; naming the column explicitly gives the
# printed tibble a readable header instead of `n/sum(n) * 100`
work_life_percent %>%
  mutate(percent = n / sum(n) * 100)
## # A tibble: 4 x 3
##   work_life_balance     n `n/sum(n) * 100`
##   <fct>             <int>            <dbl>
## 1 Bad                  80             5.44
## 2 Good                344            23.4 
## 3 Better              893            60.7 
## 4 Best                153            10.4
# Bar chart of work life balance levels with percentage labels above the bars.
# Fixes from review: geom_bar() has no `binwidth` argument (it was silently
# ignored), and the annotate() coordinates relied on accidental behaviour of
# chained `:` operators; explicit positions are used instead.
hr_plot_worklife <- ggplot(hr_cleaned, aes(x = work_life_balance)) +
  geom_bar() +
  labs(title = "Distribution of Work Life Balance", x = "Work Life Balance", y = "Frequency") + 
  theme(legend.position = "none") +
  annotate("text", x = 1:4, y = 1000,
           label = c("5.44%", "23.4%", "60.7%", "10.4%")) +
  NULL

hr_plot_worklife

Next, we examine the relationship between monthly income and other variables such as level of education, gender, job role, etc.

# Boxplots of monthly income by education level, with levels ordered by
# descending median income
ggplot(hr_cleaned,
       aes(x = reorder(education, -monthly_income, FUN = median),
           y = monthly_income)) +
  geom_boxplot() +
  labs(
    title = "Relationship Between Monthly Income and Education",
    x = "Education",
    y = " Monthly Income"
  )

# Boxplots comparing the monthly income distribution between genders
ggplot(hr_cleaned, aes(x = gender, y = monthly_income)) +
  geom_boxplot() +
  labs(
    title = "Relationship Between Monthly Income and Gender",
    x = "Gender",
    y = " Monthly Income"
  )

# Boxplots of monthly income by job role, ordered by descending income;
# x-axis labels are rotated so the long role names remain legible
ggplot(hr_cleaned,
       aes(x = reorder(job_role, -monthly_income), y = monthly_income)) +
  geom_boxplot() +
  labs(
    title = "Relationship Between Monthly Income and Job Role",
    x = "Job Role",
    y = " Monthly Income"
  ) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

We calculate the median of monthly income by education level, and plot a bar chart to study the same.

# Median monthly income per education level.
# Fix from review: the original used mutate(), which keeps all 1,470 rows and
# repeats the group median on each row; summarise() collapses the data to one
# row per education level with a named column, which is what "calculate the
# median by education level" calls for.
median_monthly_income <- hr_cleaned %>% 
  group_by(education) %>% 
  summarise(median_income = median(monthly_income))

median_monthly_income
## # A tibble: 1,470 x 20
## # Groups:   education [5]
##      age attrition daily_rate department distance_from_h… education gender
##    <dbl> <chr>          <dbl> <chr>                 <dbl> <chr>     <chr> 
##  1    41 Yes             1102 Sales                     1 College   Female
##  2    49 No               279 Research …                8 Below Co… Male  
##  3    37 Yes             1373 Research …                2 College   Male  
##  4    33 No              1392 Research …                3 Master    Female
##  5    27 No               591 Research …                2 Below Co… Male  
##  6    32 No              1005 Research …                2 College   Male  
##  7    59 No              1324 Research …                3 Bachelor  Female
##  8    30 No              1358 Research …               24 Below Co… Male  
##  9    38 No               216 Research …               23 Bachelor  Male  
## 10    36 No              1299 Research …               27 Bachelor  Male  
## # … with 1,460 more rows, and 13 more variables: job_role <chr>,
## #   environment_satisfaction <chr>, job_satisfaction <fct>,
## #   marital_status <chr>, monthly_income <dbl>, num_companies_worked <dbl>,
## #   percent_salary_hike <dbl>, performance_rating <chr>,
## #   total_working_years <dbl>, work_life_balance <fct>, years_at_company <dbl>,
## #   years_since_last_promotion <dbl>, `median(monthly_income)` <dbl>
# Bar chart of median monthly income by education level.
# Fixes from review: `y = median(monthly_income)` inside aes() evaluated to a
# single overall median, so every bar had the same height; stat_summary()
# computes the median within each education level instead. reorder() is also
# given FUN = median (its default is the mean) so the bar ordering matches
# the statistic being plotted.
ggplot(hr_cleaned,
       aes(x = reorder(education, monthly_income, FUN = median),
           y = monthly_income)) +
  stat_summary(fun = median, geom = "col") +
  labs(title = "Median Monthly Income by Education Level",
       x = "Education", y = "Median Monthly Income") +
NULL

# Histogram of monthly income, one panel per education level
ggplot(hr_cleaned, aes(x = monthly_income)) +
  geom_histogram() +
  facet_wrap(~education) +
  labs(
    title = "Distribution of Income by Education Level",
    x = "Monthly Income",
    y = "Frequency"
  ) +
  theme_bw()

# Scatter plot of income vs age with a smoothed trend line,
# one panel per job role
ggplot(hr_cleaned, aes(x = monthly_income, y = age)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(~job_role) +
  labs(
    title = "Relationship Between Income and Age by Job Role",
    x = "Income",
    y = "Age"
  ) +
  theme_bw()

The above plots highlight relationships between different variables in the dataset hr_cleaned.

As seen in the dataset, the data provided from IBM shows an attrition rate of around 16%, indicating 84% of employees remained with the firm.

The variables ‘age’, ‘years at company’, ‘monthly income’ and ‘years since last promotion’ are further analyzed to understand their distributions. The summary statistics alone can give a good indication of whether a distribution is skewed, but they are not sufficient to establish normality. Comparing the mean and median in the summary output, the distributions of years at company, income and years since last promotion appear clearly non-normal. For age, however, the summary statistics were inconclusive, so a histogram was required to assess the shape of the distribution.

Job satisfaction and work life balance were two variables with distinct descriptive levels. Plotting the distribution of job satisfaction showed that roughly two thirds of employees reported high or very high job satisfaction, while just over one third reported medium or low levels. Work life balance was far less evenly distributed: around three fifths of all employees rated theirs as "Better", while fewer than one in six rated it as either the best or the worst.

Other variables of gender, job role and education level all influence the difference in income level for employees. The dataset shows that with a higher education status and job position, the employee is likely to have a larger monthly income. Females are seen to have a slightly larger median monthly income compared to males. A general trend of higher age correlates to higher income, which is reflected in all positions in the firm.

5 Challenge 1: Replicating a chart

6 Challenge 2: 2016 California Contributors plots

To get this plot, we must join two dataframes; the one you have with all contributions, and data that can translate zipcodes to cities.

# Make sure you use vroom() as it is significantly faster than read.csv()
CA_contributors_2016 <- vroom::vroom(here::here("data","CA_contributors_2016.csv"))

# loading patchwork to combine plots
library(patchwork)

# zip-code-to-city lookup table
data_1 <- read.csv(here::here("data", "zip_code_database.csv")) 

# Fix from review: the contributions file was being read a second time with
# the slower read.csv(); reuse the vroom-loaded data frame instead.
data_2 <- CA_contributors_2016

# join contributions to city names on zip code, keeping only the columns
# needed for the plots
data_3 <- merge(data_1, data_2, by = c("zip")) %>%
  select(zip, primary_city, cand_nm, contb_receipt_amt)

To replicate the above plots, we extract data specific to Hillary Clinton and Donald Trump for the top 10 cities where they raised money, as below

# Total amount raised per city for Clinton; keep the 10 largest totals.
# count(wt = ...) is equivalent to group_by() + summarise(sum(...)).
data_clinton <- data_3 %>%
  filter(cand_nm == "Clinton, Hillary Rodham") %>%
  count(primary_city, wt = contb_receipt_amt, name = "tot1") %>%
  top_n(10, tot1)

# Total amount raised per city for Trump; keep the 10 largest totals
data_trump <- data_3 %>%
  filter(cand_nm == "Trump, Donald J.") %>%
  count(primary_city, wt = contb_receipt_amt, name = "tot2") %>%
  top_n(10, tot2)

# Horizontal bar chart of Clinton's top-10 fundraising cities.
# Fix from review: `size = 5` in geom_col() set the width of the (invisible)
# bar outline and had no visual effect, so it is removed.
plot_clinton <- ggplot(data_clinton, aes(x = tot1, y = reorder(primary_city, tot1))) + 
                geom_col(width = 0.9, fill = "dodgerblue3") +
                labs(title = "Clinton, Hillary Rodham", x = "", y = "") +
                theme_bw() + 
                #Selected a theme that closely matched the original plot
                scale_x_continuous(labels = scales::dollar_format()) +
                #Changed the scales to make them display the dollar signs
NULL
                      
# Horizontal bar chart of Trump's top-10 fundraising cities, matching the
# style of the Clinton plot (theme_bw, dollar-formatted amounts)
plot_trump <- ggplot(data_trump, aes(x = tot2, y = reorder(primary_city, tot2))) +
  geom_col(width = 0.9, fill = "firebrick") +
  labs(title = "Trump, Donald J.", x = NULL, y = NULL) +
  theme_bw() +
  scale_x_continuous(labels = scales::dollar_format())

# Place the two candidate plots side by side with patchwork, adding a
# shared title and caption
patchwork <- plot_clinton + plot_trump

patchwork +
  plot_annotation(
    title = "Where did candidates raise most money?",
    caption = "Amount raised"
  )

If we were to create the same plot for Top 10 candidates instead of just Hillary and Trump

library(tidytext)

# The ten candidates with the largest total contributions.
# Fix from review: top_n() was called without a weighting column, which
# silently falls back to the last column (with a console message); the
# ranking column is now named explicitly.
top_10_candidates <- data_3 %>%
  group_by(cand_nm) %>%
  summarise(total = sum(contb_receipt_amt)) %>%
  top_n(10, total) %>%
  ungroup()
#Here we create a new data frame that tells us which 10 candidates raised the most amount of money

data_4 <- data_3 %>%
  filter(cand_nm %in% top_10_candidates$cand_nm) %>%
  group_by(cand_nm, primary_city) %>%
  summarise(total_per_city = sum(contb_receipt_amt)) %>%
  top_n(10, total_per_city) %>%
  ungroup() %>%
  mutate(cand_nm = as.factor(cand_nm),
         primary_city = reorder_within(primary_city, total_per_city, cand_nm)) %>%
#Discovered which 10 cities funded the top 10 candidate (from before) the most
  ggplot(aes(primary_city, total_per_city, fill = cand_nm)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~cand_nm, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(y = "",
       x = " ",
       title = "Where Did 2016 Top 10 Candidates' Contributions Come From") +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12)
        )
#Plot the data and change some of its aesthetics
data_4

7 Details

  • Who did you collaborate with: Alessandro Angeletti, Zichen Wang, Johanna Jeffery, Nitya Chopra and Christopher Lewis
  • Approximately how much time did you spend on this problem set: Approximately 15h
  • What, if anything, gave you the most trouble:
  1. Figuring out when to use “color” and when to use “colour”; and
  2. The difference between “==” and “%in%”.